Abstract
Here we will be exploring data set for the flu project (a collaboration with the Ingber lab). The data corresponds to an analyses of transcriptomics data collected from one healthy donor lung cells, with cells being cultured in transwells (no endothelial cells.) Additionally, we will be looking at publicly available data for identification of potential host biomarkers for influenza infection. RNA-seq data was collected by the DFCI transcriptomics core.# load data
flu_data <- load_data()
## Parsed with column specification:
## cols(
## .default = col_double(),
## Gene_ID = col_character()
## )
## See spec(...) for full column specifications.
Here we will perform a set of data clean up steps. These will include:
count_data <- flu_data$count_data
gene_ids <- flu_data$gene_ids
sample_data <- flu_data$metadata
# remove gene ids with multiple entrez mappings
gkey <- AnnotationDbi::select(x = org.Hs.eg.db,
keys = gene_ids,
columns = "ENTREZID",
keytype = "SYMBOL")
## 'select()' returned 1:many mapping between keys and columns
# this basically looks for which genes appear more than once (i.e., have more than one entrez ID per gene ID)
nix <- names(which(sapply(gene_ids, function(y) length(which(gkey$SYMBOL == y))) != 1))
if(length(nix) != 0) {
for (i in seq(1, length(nix))) {
a1 <- which(gkey$SYMBOL == nix[i])
a2 <- which.max(diag(var(t(as.matrix(count_data[a1, ])))))
a3 <- a1[setdiff(seq(1, length(a1)), a2)]
count_data[a3, ] <- 0
}
}
# find genes with no counts across all samples
nix <- which(rowSums(count_data) == 0)
if(length(nix) != 0) {
gene_ids <- gene_ids[-nix]
count_data <- count_data[-nix, ]
}
# get annotations
gkey <- AnnotationDbi::mapIds(x = org.Hs.eg.db,
keys = gene_ids,
column = "ENTREZID",
keytype = "SYMBOL")
## 'select()' returned 1:many mapping between keys and columns
gene_df <- tibble::tibble(gene_symbol = gene_ids,
entrez_id = gkey)
# remove genes with NA entrez id
nix <- which(is.na(gene_df$entrez_id))
if(length(nix) != 0) {
gene_df <- gene_df[-nix, ]
count_data <- count_data[-nix, ]
}
# frequency across samples
# minimum 5 read counts, at least in 75% of the samples
x1 <- apply(count_data, 1, function(y) length(which(y >= 5)))
x1 <- which(x1 >= round(0.75 * ncol(count_data)))
count_data <- count_data[x1, ]
gene_df <- gene_df[x1, ]
We will use UMAP for the visualization of the count data.
tmp_data <- t(count_data)
X <- umap::umap(tmp_data)
Y <- tibble::tibble(x = X$layout[, 1],
y = X$layout[, 2],
condition = sample_data$condition,
time_point = sample_data$time,
disp_id = sample_data$condition_group_time)
p <- Y %>%
ggplot() +
geom_point(aes(x = x, y = y, color = disp_id), size = 4, alpha = 0.5) +
# scale_color_viridis_d() +
labs(x = "Dimension 1", y = "Dimension 2") +
theme_bw() +
theme(panel.grid = element_blank(),
axis.text = element_text(size = 12, color = "black"),
axis.title = element_text(size = 12, color = "black"),
axis.ticks.length = unit(0.2, "cm"))
ggsave(plot = p,
filename = here::here(paste0("res/dataset_2/", format(Sys.Date(), "%Y-%m-%d"), "_UMAP-samples.tiff")),
device = "tiff",
width = 11,
height = 8,
dpi = 600)
p
With the data processed as described in the steps above, we can now proceed with standard analyses such as calculating the differential expression for the genes given the conditions we have. For the differential expression we will use the DESeq2 package, as this is the state-of-the-art tool for differential analyses of RNA-seq data.
dds_copd_nhb <- DESeqDataSetFromMatrix(countData = count_data,
colData = sample_data,
design = ~ condition_group_time)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds_copd_nhb <- DESeq(object = dds_copd_nhb)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 3 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
counts_normalized <- counts(dds_copd_nhb,
normalized = TRUE) # normalize counts using sizeFactor; useful for GSEA_GAGE
This establishes the DESeq2 object that we can now query with different contrasts. For the purpose here we will perform the following comparisons:
contrast_groups <- tibble::tibble(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
nid = c("copd_nhb_ctr_4",
"copd_infected_4",
"nhb_infected_4",
"copd_nhb_infected_4",
"copd_nhb_ctr_18",
"copd_infected_18",
"nhb_infected_18",
"copd_nhb_infected_18",
"nhb_polyic_18",
"copd_polyic_18"),
name = c("COPD control vs NHB control",
"COPD infected vs COPD control",
"NHB infected vs NHB control",
"COPD infected vs NHB infected",
"COPD control vs NHB control",
"COPD infected vs COPD control",
"NHB infected vs NHB control",
"COPD infected vs NHB infected",
"NHB PolyIC vs NHB control",
"COPD PolyIC vs COPD control"),
time_point = c(4, 4, 4, 4, 18, 18, 18, 18, 18, 18),
virus = c(0, 1, 1, 1, 0, 1, 1, 1, 0, 0),
polyic = c(0, 0, 0, 0, 0, 0, 0, 0, 1, 1),
var1 = c("COPD_control_4",
"COPD_virus_4",
"NHB_virus_4",
"COPD_virus_4",
"COPD_control_18",
"COPD_virus_18",
"NHB_virus_18",
"COPD_virus_18",
"NHB_poly_ic_18",
"COPD_poly_ic_18"),
var2 = c("NHB_control_4",
"COPD_control_4",
"NHB_control_4",
"NHB_virus_4",
"NHB_control_18",
"COPD_control_18",
"NHB_control_18",
"NHB_virus_18",
"NHB_control_18",
"COPD_control_18"),
deseq_name = rep("condition_group_time", 10))
deseq_res <- vector(mode = "list", length = nrow(contrast_groups))
for (i in seq(1, length(deseq_res))) {
cont <- c(contrast_groups$deseq_name[i],
contrast_groups$var1[i],
contrast_groups$var2[i])
res <- results(dds_copd_nhb,
contrast = cont,
pAdjustMethod = "fdr",
cooksCutoff = FALSE)
deseq_res[[i]] <- res %>%
as_tibble() %>%
dplyr::select(., baseMean, log2FoldChange, padj) %>%
tibble::add_column(., comparison_id = contrast_groups$id[i]) %>%
tibble::add_column(., comparison_nid = contrast_groups$nid[i]) %>%
tibble::add_column(., comparison_name = contrast_groups$name[i]) %>%
tibble::add_column(., time_point = contrast_groups$time_point[i]) %>%
tibble::add_column(., virus = contrast_groups$virus[i]) %>%
tibble::add_column(., poluy_ic = contrast_groups$polyic[i]) %>%
tibble::add_column(., gene_symbol = gene_df$gene_symbol) %>%
tibble::add_column(., entrez_id = gene_df$entrez_id)
}
deseq_res <- dplyr::bind_rows(deseq_res)
deseq_res %>% tibble::add_column(., sig = 0) %>% dplyr::mutate(., sig = replace(sig, abs(log2FoldChange) > 1 & padj < 0.01, 1))
We can look at what these comparisons look like using an MA-plot or a volcano plot:
p1 <- deseq_res %>%
tibble::add_column(., sig = 0) %>%
dplyr::mutate(., sig = replace(sig, abs(log2FoldChange) > 1 & padj < 0.01, 1)) %>%
ggplot() +
geom_point(aes(y = log2FoldChange, x = log10(baseMean), color = as.factor(sig), gene = gene_symbol, entrez = entrez_id, fold_change = log2FoldChange, base_mean = log10(baseMean)), alpha = 0.5, size = 3) +
# ylim(c(-max(abs(res_v18$log2FoldChange)), max(abs(res_v18$log2FoldChange)))) +
# scale_color_viridis_d() +
scale_color_manual(breaks = c(0, 1), name = NULL, values = c("black", "red"), labels = c("Not significant", "|F| > 1, p < 0.01")) +
labs(y = "log(fold change)", x = "Mean normalized counts", title = "MA plot") +
facet_grid(time_point ~ comparison_name) +
theme_bw() +
theme(axis.text = element_text(size = 12, color = "black"),
axis.title = element_text(size = 12, color = "black"),
axis.ticks.length = unit(0.2, "cm"),
strip.text = element_text(size = 12, color = "black"),
legend.position = "top")
## Warning: Ignoring unknown aesthetics: gene, entrez, fold_change, base_mean
ggsave(plot = p1, filename = here::here(paste0("res/dataset_2/", format(Sys.Date(), "%Y-%m-%d"), "_MAplot.tiff")), device = "tiff", width = 11, height = 8, dpi = 600)
p2 <- deseq_res %>%
tibble::add_column(., sig = 0) %>%
dplyr::mutate(., sig = replace(sig, abs(log2FoldChange) > 1 & padj < 0.01, 1)) %>%
ggplot() +
geom_point(aes(x = log2FoldChange, y = -log10(padj), color = as.factor(sig), gene = gene_symbol, entrez = entrez_id, fold_change = log2FoldChange, padj = -log10(padj)), alpha = 0.5, size = 3) +
# ylim(c(0, max(-log10(res_v18$padj)))) +
# xlim(c(-max(abs(res_v18$log2FoldChange)), max(abs(res_v18$log2FoldChange)))) +
# scale_color_viridis_d() +
scale_color_manual(breaks = c(0, 1), name = NULL, values = c("black", "red"), labels = c("Not significant", "|F| > 1, p < 0.01")) +
labs(y = "log(fold change)", x = "Mean normalized counts", title = "Volcano plot") +
facet_grid(time_point ~ comparison_name) +
theme_bw() +
theme(axis.text = element_text(size = 12, color = "black"),
axis.title = element_text(size = 12, color = "black"),
axis.ticks.length = unit(0.2, "cm"),
strip.text = element_text(size = 8, color = "black"),
legend.position = "top")
## Warning: Ignoring unknown aesthetics: gene, entrez, fold_change, padj
ggsave(plot = p2,
filename = here::here(paste0("res/dataset_2/", format(Sys.Date(), "%Y-%m-%d"), "_volcano-plot.tiff")),
device = "tiff",
width = 11,
height = 8,
dpi = 600)
## Warning: Removed 282 rows containing missing values (geom_point).
p1
p2
## Warning: Removed 282 rows containing missing values (geom_point).
# # now interactive plots
# plotly::ggplotly(p1, tooltip = c("gene", "entrez_id", "fold_change", "base_mean"))
#
# plotly::ggplotly(p2, tooltip = c("gene", "entrez_id", "fold_change", "padj"))
From these plots we see that the infection with influenza virus only shows effect at the transcriptional levels at 18h, unless we are comparing healthy and diseased individuals. For that reason, I will focus the remainder of the study on the effects of the virus at the 18h time point.
Before that, though, we can define the transcriptional signature for COPD, when compared with healthy donors. This corresponds to the COPD control vs NHB control plots shown above.
# subset data
copd_sig <- deseq_res %>%
dplyr::filter(., comparison_nid == "copd_nhb_ctr_4" | comparison_nid == "copd_nhb_ctr_18")
copd_4 <- copd_sig %>%
dplyr::filter(., time_point == 4)
copd_18 <- copd_sig %>%
dplyr::filter(., time_point == 18)
From these analyses we get that there are 1896 genes that are differentially expressed at 4h and 1255 genes that are differentially expressed at 18h.
Based on these results, it is conceivable that we can identify chemical compounds for COPD therapeutics, generally speaking. Though not the focus of this work, we will run thes analyses using DRUID on both the 4h and the 18h time points.
a1 <- copd_4 %>%
dplyr::filter(., padj < 0.05, abs(log2FoldChange) > 1) %>%
dplyr::select(., entrez_id, log2FoldChange, padj)
a2 <- copd_18 %>%
dplyr::filter(., padj < 0.05, abs(log2FoldChange) > 1) %>%
dplyr::select(., entrez_id, log2FoldChange, padj)
# druid parameters
res <- vector(mode = "list", length = 5)
for (i in seq(1, 5)) {
res[[i]] <- DRUID::run_druid(dge_matrix = as.matrix(a1[, c(2, 3)]),
druid_direction = "neg",
fold_thr = 0,
pvalue_thr = 0.05,
entrez = as.matrix(a1[, 1]),
num_random = 10000,
selection = i)
}
## Generating query vector...
## Computing cosine similarity on query vector...
## Computing cosine similarity on random vectors...
## Computing cosine similarity on random vectors...
## Building results data frame...
## Generating query vector...
## Computing cosine similarity on query vector...
## Computing cosine similarity on random vectors...
## Computing cosine similarity on random vectors...
## Building results data frame...
## Generating query vector...
## Computing cosine similarity on query vector...
## Computing cosine similarity on random vectors...
## Computing cosine similarity on random vectors...
## Building results data frame...
## Generating query vector...
## Computing cosine similarity on query vector...
## Computing cosine similarity on random vectors...
## Computing cosine similarity on random vectors...
## Building results data frame...
## Generating query vector...
## Computing cosine similarity on query vector...
## Computing cosine similarity on random vectors...
## Computing cosine similarity on random vectors...
## Building results data frame...
copd_druid_4 <- dplyr::bind_rows(res)
rm(res)
res <- vector(mode = "list", length = 5)
for (i in seq(1, 5)) {
res[[i]] <- DRUID::run_druid(dge_matrix = as.matrix(a2[, c(2, 3)]),
druid_direction = "neg",
fold_thr = 0,
pvalue_thr = 0.05,
entrez = as.matrix(a2[, 1]),
num_random = 10000,
selection = i)
}
## Generating query vector...
## Computing cosine similarity on query vector...
## Computing cosine similarity on random vectors...
## Computing cosine similarity on random vectors...
## Building results data frame...
## Generating query vector...
## Computing cosine similarity on query vector...
## Computing cosine similarity on random vectors...
## Computing cosine similarity on random vectors...
## Building results data frame...
## Generating query vector...
## Computing cosine similarity on query vector...
## Computing cosine similarity on random vectors...
## Computing cosine similarity on random vectors...
## Building results data frame...
## Generating query vector...
## Computing cosine similarity on query vector...
## Computing cosine similarity on random vectors...
## Computing cosine similarity on random vectors...
## Building results data frame...
## Generating query vector...
## Computing cosine similarity on query vector...
## Computing cosine similarity on random vectors...
## Computing cosine similarity on random vectors...
## Building results data frame...
copd_druid_18 <- dplyr::bind_rows(res)
rm(res)
From these results we can explore compounds that could be relevant in the treatment of COPD as a whole. Taking the results at the 4h and 18h time points, we can find commonalities in the identified compounds.
a1 <- copd_druid_4 %>% dplyr::arrange(., desc(druid_score)) %>% dplyr::slice(1:100)
a2 <- copd_druid_18 %>% dplyr::arrange(., desc(druid_score)) %>% dplyr::slice(1:100)
a3 <- sort(intersect(a1$drug_name, a2$drug_name))
a3
## [1] "AZD-7762" "BRD-A16820783" "BRD-A93975555" "BRD-K18861610"
## [5] "BRD-K22546959" "BRD-K27484191" "BRD-K49712247" "BRD-K71265179"
## [9] "BRD-K75308990" "clobenpropit" "doxorubicin" "emetine"
## [13] "estradiol" "fenbendazole" "fulvestrant" "GW-843682X"
## [17] "imatinib" "inhibitor-BEC" "irinotecan" "LY-2183240"
## [21] "methotrexate" "oxfendazole" "paclitaxel" "parbendazole"
## [25] "PF-477736" "SB-225002" "SKF-96365" "tozasertib"
## [29] "tretinoin" "vitamin c" "VU-0365117-1" "VU-0365118-1"
Going back to the focus of this work, we will go through the transcriptomics data to make predictions on sets of compounds that can be used to improve the reponse of a host to influenza infection, using our algorithm DRUID.
a1 <- deseq_res %>%
dplyr::filter(., comparison_nid == "copd_infected_18") %>%
dplyr::filter(., padj < 0.05, abs(log2FoldChange) > 1) %>%
dplyr::select(., entrez_id, log2FoldChange, padj)
nix <- deseq_res %>%
dplyr::filter(., comparison_nid == "copd_polyic_18") %>%
dplyr::filter(., padj < 0.05, abs(log2FoldChange) > 1) %>%
dplyr::select(., entrez_id, log2FoldChange, padj)
a1 <- a1[-which(a1$entrez_id %in% nix$entrez_id), ]
# druid parameters
res <- vector(mode = "list", length = 5)
for (i in seq(1, 5)) {
res[[i]] <- DRUID::run_druid(dge_matrix = as.matrix(a1[, c(2, 3)]),
druid_direction = "neg",
fold_thr = 0,
pvalue_thr = 0.05,
entrez = as.matrix(a1[, 1]),
num_random = 10000,
selection = i)
}
## Generating query vector...
## Computing cosine similarity on query vector...
## Computing cosine similarity on random vectors...
## Computing cosine similarity on random vectors...
## Building results data frame...
## Generating query vector...
## Computing cosine similarity on query vector...
## Computing cosine similarity on random vectors...
## Computing cosine similarity on random vectors...
## Building results data frame...
## Generating query vector...
## Computing cosine similarity on query vector...
## Computing cosine similarity on random vectors...
## Computing cosine similarity on random vectors...
## Building results data frame...
## Generating query vector...
## Computing cosine similarity on query vector...
## Computing cosine similarity on random vectors...
## Computing cosine similarity on random vectors...
## Building results data frame...
## Generating query vector...
## Computing cosine similarity on query vector...
## Computing cosine similarity on random vectors...
## Computing cosine similarity on random vectors...
## Building results data frame...
copd_druid_18 <- dplyr::bind_rows(res)
rm(res)
copd_druid_18 %>%
dplyr::select(., -data_source, -cosine_similarity, -probability_random, -number_matches) %>%
dplyr::arrange(desc(druid_score))
With these results in hand, we can now look at the effect of influenza virus on normal donors doing the same analyses:
a1 <- deseq_res %>%
dplyr::filter(., comparison_nid == "nhb_infected_18") %>%
dplyr::filter(., padj < 0.05, abs(log2FoldChange) > 1) %>%
dplyr::select(., entrez_id, log2FoldChange, padj)
nix <- deseq_res %>%
dplyr::filter(., comparison_nid == "nhb_polyic_18") %>%
dplyr::filter(., padj < 0.05, abs(log2FoldChange) > 1) %>%
dplyr::select(., entrez_id, log2FoldChange, padj)
a1 <- a1[-which(a1$entrez_id %in% nix$entrez_id), ]
# druid parameters
res <- vector(mode = "list", length = 5)
for (i in seq(1, 5)) {
res[[i]] <- DRUID::run_druid(dge_matrix = as.matrix(a1[, c(2, 3)]),
druid_direction = "neg",
fold_thr = 0,
pvalue_thr = 0.05,
entrez = as.matrix(a1[, 1]),
num_random = 10000,
selection = i)
}
## Generating query vector...
## Computing cosine similarity on query vector...
## Computing cosine similarity on random vectors...
## Computing cosine similarity on random vectors...
## Building results data frame...
## Generating query vector...
## Computing cosine similarity on query vector...
## Computing cosine similarity on random vectors...
## Computing cosine similarity on random vectors...
## Building results data frame...
## Generating query vector...
## Computing cosine similarity on query vector...
## Computing cosine similarity on random vectors...
## Computing cosine similarity on random vectors...
## Building results data frame...
## Generating query vector...
## Computing cosine similarity on query vector...
## Computing cosine similarity on random vectors...
## Computing cosine similarity on random vectors...
## Building results data frame...
## Generating query vector...
## Computing cosine similarity on query vector...
## Computing cosine similarity on random vectors...
## Computing cosine similarity on random vectors...
## Building results data frame...
nhb_druid_18 <- dplyr::bind_rows(res)
rm(res)
nhb_druid_18 %>%
dplyr::select(., -data_source, -cosine_similarity, -probability_random, -number_matches) %>%
dplyr::arrange(desc(druid_score))
We can visualize these results in both donor classes easily:
a1 <- copd_druid_18 %>%
dplyr::arrange(desc(druid_score)) %>%
dplyr::slice(1:20) %>%
dplyr::mutate(., comp = "copd")
a2 <- nhb_druid_18 %>%
dplyr::arrange(desc(druid_score)) %>%
dplyr::slice(1:20) %>%
dplyr::mutate(., comp = "nhb")
# a3 <- copd_nhb_druid_18 %>%
# dplyr::arrange(desc(druid_score)) %>%
# dplyr::slice(1:20) %>%
# dplyr::mutate(., comp = "copd_nhb")
druid_res_1 <- dplyr::bind_rows(a1, a2)
p2 <- druid_res_1 %>%
ggplot() +
geom_point(aes(x = druid_score, y = drug_name, color = comp), size = 4, alpha = 0.5) +
scale_color_viridis_d() +
labs(x = "DRUID score", y = NULL) +
theme_bw() +
theme(axis.text = element_text(size = 12, color = "black"),
axis.text.x = element_blank(),
axis.title.y = element_blank())
ggsave(plot = p2,
filename = here::here(paste0("res/dataset_2/", format(Sys.Date(), "%Y-%m-%d"), "_DRUID-RESULTS-2class.tiff")),
device = "tiff",
width = 11,
height = 8,
dpi = 600)
p2
druid_res_1 %>%
dplyr::select(., -data_source, -cosine_similarity, -probability_random, -number_matches) %>%
dplyr::arrange(desc(druid_score))
Additionally, we can use publicly available data to assess which genes are preferentially expressed in a given tissue. For this particular problem, we are interested in identifying the genes that are mostly expressed in lung. For that, we will use the human expression atlas data that is housed at BioGPS. We can also use data from GTex, though this is not implemented yet (will include this in future work.)
The tissue specificity is calculated using an adaptation of Stouffer’s method. Briefly, we normalize the expression data along the gene and the sample spaces, independently, and we combine the normalized scores into a single representation given by:
$ Z_{global} = $
where \(Z_{global}\) is the tissue specificity score, \(Z_{genes}\) corresponds to the normalized data along the gene space (normalizing along the columns), and \(Z_{samples}\) is the normalized data along the sample space (normalizing along the rows).
First, we will compute the average expression of any given gene for any given tissue.
# biogps data ----
load(here::here("data/2019-02-13_biogps.RData"))
# average of tissues ----
utis <- unique(biogps_data$tissues)
avg_E <- matrix(0, nrow = nrow(biogps_data$expression), ncol = length(utis))
for (i in seq(1, length(utis))) {
u1 <- which(biogps_data$tissues == utis[i])
if (length(u1) > 1) {
avg_E[, i] <- rowMeans(biogps_data$expression[, u1])
} else {
avg_E[, i] <- biogps_data$expression[, u1]
}
}
With these data we can now computer the tissue specificity:
# lung specific genes ----
stouf_thr <- 3
ts_stouffer <- stouffer_specificity(avg_E)
As we can see, the dimensions of the (average) expression matrix (ncols = 29; nrows = 7937) and the tissue specificity matrix (ncols = 29; nrows = 7937) are exactly the same, which is what we wanted.
Using the transformation above, we will look for genes that are primarily expressed in the lung.
B <- ts_stouffer
B[which(B < stouf_thr)] <- 0
B[B != 0] <- 1
num_tissues <- rowSums(B)
spec_genes <- which(num_tissues <= 2 & num_tissues != 0) # <-- gives at most 2 tissues with high specificity for a given gene
# now make a data frame for these:
specificity_df <- tibble::tibble(gene_symbol = rep(biogps_data$genes$SYMBOL[spec_genes], length(utis)),
entrez_id = rep(biogps_data$genes$ENTREZID[spec_genes], length(utis)),
tissue_name = as.vector(sapply(utis, rep, length(spec_genes))),
presence_call = as.vector(B[spec_genes, ]))
specificity_df <- specificity_df %>%
dplyr::filter(., presence_call != 0)
lung_genes <- specificity_df %>%
dplyr::filter(., tissue_name == "lung") %>%
dplyr::select(., -presence_call)
lung_genes
Looking at our data set we can see that there are 1 genes that are lung specific and differentially expressed in our experiments. These are TNFSF10.